from IPython.display import HTML
# Notebook-only convenience: inject a jQuery snippet that hides every code
# cell on load and exposes a "HERE" link to toggle code visibility.
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">HERE</a>.''')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_curve
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from imblearn.under_sampling import NearMiss
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
# Silence every warning for cleaner notebook output by replacing
# warnings.warn with a no-op.
# NOTE(review): heavy-handed — warnings.filterwarnings("ignore") would be the
# conventional mechanism; confirm nothing downstream relies on warnings firing.
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
The goal of this project is to try different algorithms in order to understand how accurate they are in detecting whether a client will default on their credit card payment.
Banks have to carefully check customers in order to avoid financial losses.
To do that they need to be able to understand if a customer will present a default credit card status.
So for this task a good classifier is the one which is able to correctly classify a default payment, since it represents the financial loss of the bank.
Obviously also the non default clients have to be correctly classified, otherwise the bank will lose them as clients.
The dataset used to complete this task is default of credit card clients from UCI machine learning repository.
It contains 30000 instances, each one describes a client of a Taiwan bank, taking into account different attributes (24 precisely).
# Load the UCI "default of credit card clients" spreadsheet.  The first row
# read by pandas is a generic header, so the real column names live in the
# first data row: promote that row to the header and drop it, then cast
# everything to int64 (all columns in this dataset are integer-coded).
df=pd.read_excel("credit_card.xls",index_col=0)
column_name=df.iloc[0]
credit_card = df[1:].copy()
credit_card.columns = column_name
credit_card=credit_card.astype("int64")
# Check the whole table for missing values in one vectorised pass instead of
# looping over per-column null counts (the original loop also never broke
# early once a missing value was found).
missing = bool(credit_card.isnull().values.any())
if missing:
    print("ATTENTION: there are missing values!")
else:
    print("There are not missing values.")
# Report the distinct values of every categorical / repayment-status column so
# out-of-dictionary codes (e.g. EDUCATION 0/5/6) can be spotted.
columns_to_inspect = [
    ("SEX attribute", "SEX"),
    ("EDUCATION attribute", "EDUCATION"),
    ("MARRIAGE attribute", "MARRIAGE"),
    ("RESPONSE (YES or NO)", "default payment next month"),
    ("PAY_0", "PAY_0"),
    ("PAY_2", "PAY_2"),
    ("PAY_3", "PAY_3"),
    ("PAY_4", "PAY_4"),
    ("PAY_5", "PAY_5"),
    ("PAY_6", "PAY_6"),
]
for label, column in columns_to_inspect:
    print(f"Value in {label} column: {set(credit_card[column])}")
So looking at the values present in the attributes printed before some changes have to be done:
# Remap undocumented category codes onto documented ones:
# - MARRIAGE 0 is undocumented -> fold into 3 ("others")
# - EDUCATION 0/5/6 are undocumented -> fold into 4 ("others")
# - PAY_* codes -2/-1 are collapsed into 0 (presumably both mean "no payment
#   delay" — confirm against the dataset's data dictionary)
credit_card["MARRIAGE"]=credit_card["MARRIAGE"].replace(0,3)
credit_card["EDUCATION"]=credit_card["EDUCATION"].replace([0,5,6],4)
for pay_column in ["PAY_0","PAY_2","PAY_3","PAY_4","PAY_5","PAY_6"]:
    credit_card[pay_column]=credit_card[pay_column].replace([-2,-1],0)
credit_card.describe()
It's possible to see how for example:
# Bar chart of the class balance: number (and percentage) of default vs
# non-default customers.
x=np.arange(2)
yes=credit_card[credit_card["default payment next month"]==1].shape[0]
no=credit_card[credit_card["default payment next month"]==0].shape[0]
y=[yes,no]
fig,ax=plt.subplots(figsize=(15,6))
ax.set_title("DEFAULT PAYMENT",fontsize=15)
ax.set_ylabel("Number of customers",fontsize=10)
plt.xticks(x,("YES","NO"),fontsize=10)
plt.bar(x,y,color=["red","green"])
tot=yes+no
# Annotate each bar with its absolute count and its share of the dataset.
for i in range(len(x)):
    ax.annotate((y[i],("%.2f "%((y[i])/tot*100))+"%"),(x[i],y[i]),xytext=(0,3),textcoords="offset points",ha="center",va="bottom",fontsize=11)
plt.show()
As it's possible to see, the two classes (default and no default) are highly unbalanced.
From this some considerations could be done:
Since the dataset is not so small, adopting an undersampling strategy should not lose a lot of information.
At the same time, oversampling the smaller class does not lose any information, but in this case synthetic data are used, so this should be taken into account.
It's not so easy to understand what should be done, in most of the cases the best practice is the one to ask to a domain expert what we should do.
def plot0(colonna):
    """Scatter plot of customer counts per distinct value of `colonna`,
    drawn separately for default (red) and non-default (green) clients.

    Fix: the counts are looked up while iterating the SAME sorted list used
    for the x axis.  The original code iterated the raw set instead, so the
    sorted x positions could be paired with counts in arbitrary hash order,
    silently mispairing points.
    """
    yes = credit_card[credit_card["default payment next month"] == 1]
    no = credit_card[credit_card["default payment next month"] == 0]
    lista_amount_YES = sorted(set(yes[colonna]))
    conti_YES = yes[colonna].value_counts()
    count_per_limit_YES = [conti_YES[amount] for amount in lista_amount_YES]
    lista_amount_NO = sorted(set(no[colonna]))
    conti_NO = no[colonna].value_counts()
    count_per_limit_NO = [conti_NO[amount] for amount in lista_amount_NO]
    fig, ax = plt.subplots(figsize=(11.5,4))
    ax.scatter(lista_amount_YES, count_per_limit_YES, c="red", label="default payment = YES")
    ax.scatter(lista_amount_NO,count_per_limit_NO, c="green", label="default payment = NO")
    ax.set_title(colonna,fontsize=12)
    ax.set_ylabel("Number of customers",fontsize=10)
    ax.legend(fontsize=12)
    plt.show()
Distributions of LIMIT BAL, AGE, SEX, EDUCATION and MARRIAGE attributes according to the value of default payment:
# Per-class distributions of the two continuous attributes.
plot0("LIMIT_BAL")
plot0("AGE")
def plot2(colonna,nomi):
    """Grouped bar chart: for each category of `colonna`, the number of
    default (red) vs non-default (green) customers.

    `nomi` supplies the tick labels and must be ordered by ascending category
    code: categories are iterated in sorted order so the bars line up with the
    labels deterministically (the original iterated a set, whose iteration
    order is an implementation detail).
    """
    possible_values = sorted(set(credit_card[colonna]))
    x = np.arange(len(possible_values))
    y_yes = []
    y_no = []
    for value in possible_values:
        category = credit_card[credit_card[colonna] == value]
        y_yes.append(category[category["default payment next month"] == 1].shape[0])
        y_no.append(category[category["default payment next month"] == 0].shape[0])
    width = 0.35
    fig,ax=plt.subplots(figsize=(10,4))
    rects1 = ax.bar(x - width/2, y_yes, width, label="default payment = YES",color="red")
    rects2 = ax.bar(x + width/2, y_no, width, label="default payment = NO",color="green")
    ax.set_title(colonna,fontsize=12)
    ax.set_ylabel("Number of customers",fontsize=10)
    ax.set_xticks(x)
    ax.set_xticklabels(nomi,fontsize=10)
    ax.legend()
    def autolabel(rects):
        # Write each bar's height just above the bar.
        for rect in rects:
            height = rect.get_height()
            ax.annotate('{}'.format(height),xy=(rect.get_x() + rect.get_width() / 2, height), xytext=(0, 3),textcoords="offset points",ha='center', va='bottom',fontsize=8)
    autolabel(rects1)
    autolabel(rects2)
    fig.tight_layout()
    plt.show()
# Per-class distributions of the categorical attributes (labels ordered by
# ascending category code).
plot2("SEX",["male","female"])
plot2("EDUCATION",["graduate school","university","high school","others"])
plot2("MARRIAGE",["married","single","others"])
General distribution of SEX, EDUCATION and MARRIAGE attributes:
def plot1(colonna,nomi):
    """Bar chart of the overall distribution of categorical column `colonna`.

    Categories are iterated in sorted order so the bars line up with the
    `nomi` tick labels deterministically (the original iterated a set, whose
    iteration order is an implementation detail); the set of distinct values
    is also computed once instead of twice.
    """
    possible_values = sorted(set(credit_card[colonna]))
    x = np.arange(len(possible_values))
    y = [credit_card[credit_card[colonna] == value].shape[0] for value in possible_values]
    fig,ax=plt.subplots(figsize=(11.5,4))
    ax.set_title(colonna,fontsize=12)
    ax.set_ylabel("Number of customers",fontsize=10)
    # SEX gets its own palette; other columns share one, trimmed to the
    # number of categories actually plotted.
    if colonna=="SEX":
        colori=["cyan","pink"]
    else:
        colori=["yellow","blue","magenta","orange"]
    colori=colori[:len(nomi)]
    plt.bar(x,y,color=colori)
    plt.xticks(x,(nomi),fontsize=10)
    for i in range(len(x)):
        ax.annotate(y[i],(x[i],y[i]),xytext=(0,3),textcoords="offset points",ha="center",va="bottom",fontsize=8)
    plt.show()
# Overall distributions of the categorical attributes.
plot1("SEX",["male","female"])
plot1("EDUCATION",["graduate school","university","high school","others"])
plot1("MARRIAGE",["married","single","others"])
This representation is very useful to understand the relationship between the dimensions of the dataset when they are more than 3 (since scatter plot works only in 2D and 3D cases).
# Pairwise scatter plots of the demographic attributes, coloured by default
# status (green = no default, red = default).
plot=sn.pairplot(credit_card,vars=["LIMIT_BAL","SEX","EDUCATION","MARRIAGE","AGE"],hue="default payment next month",palette=["green","red"])
# One-hot encode the categorical attributes so the codes are not treated as
# ordinal by correlation analysis and the models.  The original started with
# `credit_card_dummies = credit_card` — a dead aliasing assignment that was
# immediately overwritten — and repeated the concat/drop three times; start
# from a copy and encode each column in a loop instead.
credit_card_dummies = credit_card.copy()
for categorical_column in ["SEX", "EDUCATION", "MARRIAGE"]:
    dummies = pd.get_dummies(credit_card[categorical_column], prefix=categorical_column)
    credit_card_dummies = pd.concat([credit_card_dummies, dummies], axis=1)
    credit_card_dummies = credit_card_dummies.drop([categorical_column], axis=1)
Correlation matrix, as pairwise scatterplots, allows to understand the relationships among attributes; and thanks to correlation coefficients a precise measure of these relations is given.
Correlation coefficients are calculated through Pearson's method, and so according to the following formula:
$\rho_{X,Y}=\frac{cov(X,Y)}{\sigma_{X}\sigma_{Y}} $
where $cov(X,Y)$ is the covariance between X and Y, while $\sigma_{X}$ is the standard deviation of X and $\sigma_{Y}$ is the standard deviation of Y.
For the sake of completeness:
The interpretation of the obtained coefficient is the following:
Note that to properly understand the correlations among attributes, categorical attributes like marriage, sex and education must be converted into dummy variables.
# Heatmap of the Pearson correlation matrix over all (dummy-encoded) columns.
fig,ax=plt.subplots(figsize=(30,30))
corrMatrix = credit_card_dummies.corr()
sn.heatmap(corrMatrix,annot=True)
# Workaround for a known matplotlib/seaborn rendering issue where the first
# and last heatmap rows are cropped: widen the y limits by half a cell.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.show()
The last step of data exploration is looking for outliers.
This step is very important because outliers could affect the performance of the model (in particular of those models which use equations).
Sometimes outliers can be legitimate extreme values, but it could happen that they are the result of bad data collection or misdialed values (for example a negative price).
In the latter case they have to be removed.
A very useful tool to detect possible outliers is the box plot:
box plot is a way of displaying the dataset based on a five-number summary:
Here the boxplots of the numerical features are analyzed; at this step no standardization is applied, in order to understand if a possible outlier has a reasonable value or not:
# Keep only the numerical features for univariate outlier inspection.
study_possible_outlier=credit_card[['LIMIT_BAL','AGE', 'BILL_AMT1', 'BILL_AMT2','BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1','PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']].copy()
# Box plot of the credit limit (raw, unscaled values).
fig, ax = plt.subplots(figsize=(6,4))
ax.set_title('LIMIT_BAL Box plot')
data=[study_possible_outlier['LIMIT_BAL']]
ax.boxplot(data)
plt.show()
Looking at this box plot there doesn't seem to be outliers:
# Box plot of customer age (raw values).
fig, ax = plt.subplots(figsize=(6,4))
ax.set_title('AGE Box plot')
data=[study_possible_outlier['AGE']]
ax.boxplot(data)
plt.show()
Also in this case there doesn't seem to be outliers:
# Side-by-side box plots of the six monthly bill-statement amounts.
fig, ax = plt.subplots(figsize=(6,4))
ax.set_title('BILL AMT Box plot')
data=[study_possible_outlier['BILL_AMT1'],study_possible_outlier['BILL_AMT2'],study_possible_outlier['BILL_AMT3'],study_possible_outlier['BILL_AMT4'],study_possible_outlier['BILL_AMT5'],study_possible_outlier['BILL_AMT6'],]
ax.boxplot(data)
plt.show()
This attribute represents the bill statement and it's not so easy to understand if these are outliers in the sense of wrong data collection or they are simply extreme values:
# Side-by-side box plots of the six previous-payment amounts.
fig, ax = plt.subplots(figsize=(6,4))
ax.set_title('PAY AMT Box plot')
data=[study_possible_outlier['PAY_AMT1'],study_possible_outlier['PAY_AMT2'],study_possible_outlier['PAY_AMT3'],study_possible_outlier['PAY_AMT4'],study_possible_outlier['PAY_AMT5'],study_possible_outlier['PAY_AMT6']]
ax.boxplot(data)
plt.show()
This attribute represents the amount of the previous payments:
It can be also very useful to visualize the boxplot of all the numerical features together; remembering to normalize them otherwise the representation is meaningless.
# Joint box plot of all numerical features on a common scale.
# NOTE(review): sklearn's Normalizer rescales each ROW to unit norm, not each
# column; for comparing per-feature spreads a column-wise scaler such as
# MinMaxScaler would normally be used — confirm the row-wise scaling is
# intentional here.
study_possible_outlier_transformed = Normalizer().fit_transform(study_possible_outlier)
study_possible_outlier_transformed_df=pd.DataFrame(study_possible_outlier_transformed,columns=study_possible_outlier.columns)
fig, ax = plt.subplots(figsize=(15,10))
ax.grid()
ax = sn.boxplot(data=study_possible_outlier_transformed_df)
It's possible to see that the biggest variation is visible among PAY_AMTi features, instead the others are more stable.
The method previously seen is useful to detect outliers in a univariate way: looking at one feature at time, but it could be useful to try to detect possible outliers by considering all the features at the same time.
Isolation forest allows to do it: the main idea is that outliers are few and far from others observation.
# Multivariate outlier detection with an Isolation Forest over all encoded
# features; predict() returns -1 for outliers and 1 for inliers.
df_isolation_data=credit_card_dummies[['LIMIT_BAL', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5','PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4','BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3','PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6','SEX_1', 'SEX_2', 'EDUCATION_1', 'EDUCATION_2', 'EDUCATION_3','EDUCATION_4', 'MARRIAGE_1', 'MARRIAGE_2', 'MARRIAGE_3']].copy()
isolationForestDetector = IsolationForest(random_state=0).fit(df_isolation_data)
scores=isolationForestDetector.predict(df_isolation_data)
classiTot=credit_card_dummies["default payment next month"]
# Drop only flagged rows of the majority (non-default) class so no scarce
# minority samples are lost.  Positional lookup is `classiTot.iloc[pos]`;
# the original `classiTot.iloc(0)[conto]` called the indexer with axis=0 —
# equivalent but obscure — and kept a manual counter instead of enumerate.
index_to_eliminate=[pos for pos,score in enumerate(scores)
                    if score==-1 and classiTot.iloc[pos]==0]
credit_card_dummies.drop(credit_card_dummies.index[index_to_eliminate],inplace=True)
To properly evaluate the algorithm the entire dataset has to be divided into train and test set before applying any preprocessing step and before also undersampling or oversampling techniques, otherwise the result will be meaningless since it's evaluated on a different data distribution.
To divide the original dataset into the training and testing parts, the "train_test_split" method of the sklearn library is used, which is very useful because it allows specifying that data have to be split in a stratified fashion: in this case data are split in order to maintain the same proportion of YES and NO default.
# 80/20 stratified split (preserves the default / non-default ratio in both
# parts), done BEFORE any scaling, PCA or resampling to avoid leakage.
df_train,df_test=train_test_split(credit_card_dummies,train_size=0.80,shuffle=True,stratify=credit_card_dummies['default payment next month'],random_state=42)
label_train=df_train["default payment next month"].values.tolist()
df_train=df_train.drop(columns=["default payment next month"])
label_test=df_test["default payment next month"].values.tolist()
df_test=df_test.drop(columns=["default payment next month"])
When data has a large number of features it is often useful to look for a lower-dimensional representation which preserves most of its properties, the most widely used techniques to do this are PCA and SVD.
Thinking about PCA (explained better below): its aim is to find the components which maximize the variance, but if one feature varies less than another (because of the scale for example) PCA might determine that the direction of maximal variance more closely corresponds with the feature axis having the highest variance, even if it's not correct.
So scaling is an important step to do in order to be sure that a dimensionality reduction technique works in a proper way.
Principal Component Analysis.
So now the PCA is applied only on numerical attributes, and not to the ones which in origin were categorical attributes.
To understand which is a good number of components to use, the graph of explained variance has to be examined.
In this case the number of components chosen is the one which allows reaching an explained variance of 80%.
# Fit a full 20-component PCA on the standardized TRAINING features and plot
# the cumulative explained-variance curve to choose the component count.
features = ['LIMIT_BAL', 'AGE', 'PAY_0','PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1','BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6','PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5','PAY_AMT6']
scaler = StandardScaler()
scaler.fit(df_train[features])
X = scaler.transform(df_train[features])
pca = PCA(n_components = 20)
pca.fit(X)
cum_variance = np.cumsum(pca.explained_variance_ratio_)
# Index of the first component count whose cumulative variance exceeds 80%.
idx1 = np.argmax(cum_variance > .80)
value=cum_variance[idx1]
x_plot=np.linspace(1,len(cum_variance),len(cum_variance))
fig,ax=plt.subplots(figsize=(15,6))
ax.set_title("Explained variance",fontsize=15)
ax.set_ylabel("Explained variance ratio",fontsize=10)
ax.set_xlabel("Number of components",fontsize=10)
plt.xticks(np.arange(1,21))
ax.plot(x_plot,cum_variance)
plt.scatter(idx1+1,value)
string=f"{idx1+1} components"
# NOTE(review): annotation is anchored at idx1 while the marker sits at
# idx1+1 — looks off by one plot unit; confirm intended placement.
ax.annotate(string,(idx1,value),xytext=(-20,10),textcoords="offset points",ha="center",va="bottom",fontsize=15)
ax.grid(True)
# NOTE(review): "reviews_train.png" looks like a leftover filename from
# another project — consider renaming.
plt.savefig("reviews_train.png")
plt.show()
# Project the numerical features onto 9 principal components (~80% explained
# variance).  Both the scaler and the PCA are fitted on the TRAINING set only
# and then applied to the test set.  Fix: the original called
# `pca.fit_transform(X_test)`, which refitted the PCA on the test data —
# data leakage, and a projection inconsistent with the training components.
features = ['LIMIT_BAL', 'AGE', 'PAY_0','PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1','BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6','PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5','PAY_AMT6']
scaler = StandardScaler()
scaler.fit(df_train[features])
X_train = scaler.transform(df_train[features])
X_test = scaler.transform(df_test[features])
pca = PCA(n_components = 9)
transformed_train=pca.fit_transform(X_train)
transformed_test=pca.transform(X_test)
# Append the component columns and drop the original numerical features.
for count in range(0,9):
    stringa="PCA_"+str(count+1)
    df_train[stringa]=transformed_train[:,count]
    df_test[stringa]=transformed_test[:,count]
df_train=df_train.drop(columns=['LIMIT_BAL', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5','PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4','BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3','PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'])
df_test=df_test.drop(columns=['LIMIT_BAL', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5','PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4','BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3','PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'])
Cross validation is a widely used approach for estimating test error (which is the average error that results from using a statistical learning method to predict the target on a new observation, that was not present in the training set).
When undersampling or oversampling techniques are used, cross validation has to be used carefully, because if those techniques are done before cross validation, all this will cause data leakage problem since the probability density function is altered.
For this reason an ad-hoc process is built:
The score used to select the best configuration for each classifier is f1_score, according to the task.
The first technique used to deal with the unbalance dataset is undersampling, in particular Random undersampling.
This method consists in remove samples from the majority class randomly.
It must be taken into account that it may discard useful samples.
Practically, from the points belonging to the majority class, a random sample is taken (without replacement) in order to have a set with the same cardinality as the smallest class.
def undersampling(x, y):
    """Randomly undersample the majority class (label 0) down to the size of
    the minority class (label 1).

    Works on a copy of `x`, so the caller's frame is never mutated.  The
    sample is drawn without replacement with a fixed seed (random_state=42)
    for reproducibility.  Returns the balanced (features, labels) pair.
    """
    combined = x.copy()
    combined["label"] = y
    minority = combined[combined["label"] == 1]
    majority = combined[combined["label"] == 0].sample(n=len(minority), random_state=42)
    balanced = pd.concat([minority, majority])
    labels = balanced["label"].values.tolist()
    features = balanced.drop(["label"], axis=1)
    return features, labels
def oversampling(x, y):
    """Balance the classes by SMOTE-generating synthetic minority samples.

    Works on a copy of `x` so the caller's frame is never mutated; uses a
    fixed seed (random_state=42) for reproducibility.  Returns the resampled
    (features, labels) pair.
    """
    smote = SMOTE(sampling_strategy="minority", random_state=42)
    resampled_data, resampled_labels = smote.fit_resample(x.copy(), y)
    return resampled_data, resampled_labels
def my_k_fold(configuration,algorithm,typology):
    """Stratified 5-fold grid search with fold-wise resampling.

    Parameters
    ----------
    configuration : dict
        Parameter grid (ParameterGrid-compatible) to search.
    algorithm : estimator class
        Classifier class, instantiated as ``algorithm(**params)``.
    typology : str
        "undersampling" applies random undersampling to each training fold;
        anything else applies SMOTE oversampling.

    Returns ``(best_params, best_mean_f1, all_params, all_mean_f1s)``.
    Resampling happens AFTER the split and only on the training folds, so the
    validation folds keep the original class distribution (no leakage).
    """
    f1_scores=[]
    configurations=[]
    skf=StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
    # `params` (not `configuration`) avoids shadowing the grid parameter,
    # which the original loop variable clobbered.
    for params in ParameterGrid(configuration):
        configurations.append(params)
        fold_f1=[]
        for train_indices,validation_indices in skf.split(df_train,label_train):
            x_train=df_train.iloc[train_indices]
            y_train=np.array(label_train)[train_indices]
            if typology=="undersampling":
                x_train,y_train=undersampling(x_train,y_train)
            else:
                x_train,y_train=oversampling(x_train,y_train)
            x_validation=df_train.iloc[validation_indices]
            y_validation=np.array(label_train)[validation_indices]
            classifier=algorithm(**params)
            classifier.fit(x_train,y_train)
            prediction=classifier.predict(x_validation)
            fold_f1.append(f1_score(y_validation,prediction))
        f1_scores.append(np.average(fold_f1))
    # First index of the maximum mean F1.  The original's if/else over
    # np.where both reduced to exactly this, so ties are still broken by
    # grid order.
    best_index=int(np.argmax(f1_scores))
    return configurations[best_index],f1_scores[best_index],configurations,f1_scores
def plot_all(classifier,classifier_name,best_configuration,best_value,all_configurations,all_values,method):
    """Retrain `classifier` (a class, instantiated with `best_configuration`)
    on the full resampled training set, evaluate it on the held-out test set,
    and render a plotly summary (configurations table, metrics table and a
    normalized confusion matrix).

    `method` selects the resampling applied to the training data
    ("undersampling" or anything else for oversampling).  When
    `all_configurations` is empty (classifiers with no grid, e.g. GaussianNB)
    a shorter layout without the configuration tables is shown.

    Returns the per-class recall array from precision_recall_fscore_support
    (index 0 = non-default, index 1 = default).
    """
    classifier=classifier(**best_configuration)
    if method=="undersampling":
        x_train,y_train=undersampling(df_train,label_train)
    else:
        x_train,y_train=oversampling(df_train,label_train)
    classifier.fit(x_train,y_train)
    prediction=classifier.predict(df_test)
    precision,recall,fscore,support=precision_recall_fscore_support(label_test,prediction)
    acc=accuracy_score(label_test,prediction)
    # Tally the confusion-matrix cells by hand from the test predictions.
    true_positive=0
    true_negative=0
    false_positive=0
    false_negative=0
    for i in range(len(prediction)):
        if prediction[i]==1:
            if label_test[i]==1:
                true_positive=true_positive+1
            else:
                false_positive=false_positive+1
        else:
            if label_test[i]==0:
                true_negative=true_negative+1
            else:
                false_negative=false_negative+1
    # Row-normalized confusion-matrix cells (Italian names = cell positions:
    # alto/basso = top/bottom, sinistra/destra = left/right).
    altosinistra=true_negative/(false_positive+true_negative)
    altodestra=1-altosinistra
    bassodestra=true_positive/(false_negative+true_positive)
    bassosinistra=1-bassodestra
    z=np.array([bassosinistra,bassodestra,altosinistra,altodestra])
    z.resize(2,2)
    z_text = np.around(z, decimals=2) # Only show rounded value (full value on hover)
    # Per-class metric strings, formatted "non-default - default".
    str_precision=str(round(precision[0],3))+" - "+str(round(precision[1],3))
    str_recall=str(round(recall[0],3))+" - "+str(round(recall[1],3))
    str_fscore=str(round(fscore[0],3))+" - "+str(round(fscore[1],3))
    str_accuracy=str(round(acc,3))
    values=[["Precision","Recall","Fscore","Accuracy"],[str_precision,str_recall,str_fscore,str_accuracy]]
    # Stringify every tried configuration for the summary table.
    all_c=[]
    all_v=[]
    for i in range(len(all_configurations)):
        all_c.append(str(all_configurations[i]))
        all_v.append(all_values[i])
    values_all_configuration=[all_c,all_v]
    # Annotated heatmap of the normalized confusion matrix.
    fig1 = ff.create_annotated_heatmap(z,x=[0,1],y=[1,0],annotation_text=z_text, colorscale='Blues',hoverinfo='z')
    for i in range(len(fig1.data)):
        fig1.data[i].xaxis='x1'
        fig1.data[i].yaxis='y1'
    fig1.layout.xaxis1.update(side="bottom",title="Predicted values")
    fig1.layout.yaxis1.update(side="bottom",title="True values")
    fig2 = go.Figure(data=[go.Table(cells=dict(values=["RESULTS OBTAINED WITH THE BEST CONFIGURATION FOUND, metrics summarization and confusion matrix"],line_color='white',
    fill_color='white',))])
    # No grid searched: show only the metrics table and confusion matrix.
    if len(all_configurations)==0:
        fig = make_subplots(
        rows=4, cols=1,
        shared_xaxes=True,
        vertical_spacing=0.03,
        specs=[ [{"type": "table"}],[{"type": "scatter"}],[{"type": "table"}],[{"type": "table"}],
        ],
        )
        fig.add_trace(
        go.Table(
        header=dict(
        values=["Metrics","Non Default - Default"],
        font=dict(size=10),
        align="left"
        ),
        cells=dict(
        values=values,
        align = "left")
        ),
        row=1, col=1
        )
        fig.add_trace(
        fig1.data[0],
        row=2, col=1
        )
        fig.update_layout(
        height=800,
        showlegend=False,
        title_text=f"{classifier_name} Results",
        )
        fig.layout.update(fig1.layout)
        fig.show()
        return recall
    # Full layout: all configurations, best configuration, metrics, matrix.
    fig = make_subplots(
    rows=5, cols=1,
    shared_xaxes=True,
    vertical_spacing=0.03,
    specs=[[{"type": "table"}],
    [{"type": "table"}],
    [{"type": "table"}],
    [{"type": "table"}],
    [{"type": "scatter"}],
    ],
    )
    fig.add_trace(
    go.Table(
    header=dict(
    values=["All configurations tried","Fscore"],
    font=dict(size=10),
    align="left"
    ),
    cells=dict(
    values=values_all_configuration,
    align = "left")
    ),
    row=1, col=1
    )
    fig.add_trace(
    go.Table(
    header=dict(
    values=["Best configuration","Fscore"],
    font=dict(size=10),
    align="left"
    ),
    cells=dict(
    values=[str(best_configuration),best_value],
    align = "left")
    ),
    row=2, col=1
    )
    fig.add_trace(
    fig2.data[0],
    row=3, col=1
    )
    fig.add_trace(
    go.Table(
    header=dict(
    values=["Metrics","Non Default - Default"],
    font=dict(size=10),
    align="left"
    ),
    cells=dict(
    values=values,
    align = "left")
    ),
    row=4, col=1
    )
    fig.add_trace(
    fig1.data[0],
    row=5, col=1
    )
    fig.update_layout(
    height=800,
    showlegend=False,
    title_text=f"{classifier_name} Results",
    )
    fig.layout.update(fig1.layout)
    fig.layout.update(fig2.layout)
    fig.show()
    return recall
Random forest is an ensemble of decision trees.

Random forest construction:
Random forest classification:
Strong and weak point:
Hyperparameters selection:
# Random Forest, undersampling: grid-search, then evaluate the best model.
param_grid_random_forest={"n_estimators":[100,300,500],"criterion":["gini","entropy"],"max_depth":[None,10]}
configuration_random_forest,best_rf,configurations_rf,f1_scores_rf=my_k_fold(param_grid_random_forest,RandomForestClassifier,"undersampling")
recall_random_forest_undersampling=plot_all(RandomForestClassifier,"Random Forest",configuration_random_forest,best_rf,configurations_rf,f1_scores_rf,"undersampling")
Logistic regression is a statistical method for predicting binary classes, namely it's a special case of linear regression where the targets are categorical and only two classes are present: response variable $Y$ is Bernoulli (if there are more classes it's the so called multinomial logistic regression) .
The difference with linear regression is that Sigmoid function is used as cost function in order to limit it between 0 and 1.
The Sigmoid function is used to map predictions to probabilities.
$f(X)=\frac{1}{1+e^{-(\beta_{0}+\beta_{1}X)}}$
Then the log odds are computed and is obtained the following equation:
$log \frac{f(X)}{1-f(X)}= \beta_{0}+\beta_{1}X$
In order to minimize the error,$\beta_{i}$ coefficients are estimated through the Maximum Likelihood Estimation
$l(\beta_{0},\beta_{1})=\prod_{i:y_{i}=1}p(x_{i})\prod_{i:y_{i}=0}(1-p(x_{i}))$
Once $\beta_{i}$ have been estimated, it's enough to compute the probabilities, and the record is assigned to the class with the highest probability.
What mentioned above is about problems where there is a single predictor, instead if there are more predictors the log odds have to be generalized:
$log \frac{f(X)}{1-f(X)}= \beta_{0}+\beta_{1}X+...+\beta_{p}X_{p}$
since $f(X)=\frac{1}{1+e^{-(\beta_{0}+\beta_{1}X+...+\beta_{p}X_{p})}}$
Hyperparameters selection:
# Logistic Regression, undersampling.
param_grid_logistic_regression={"penalty":["l2"],"C":[0.1,1,10,100],"tol":[1e-5,1e-4,1e-3]}
configuration_logistic_regression,best_lr,configurations_lr,f1_scores_lr=my_k_fold(param_grid_logistic_regression,LogisticRegression,"undersampling")
recall_logistic_regression_undersampling=plot_all(LogisticRegression,"Logistic Regression",configuration_logistic_regression,best_lr,configurations_lr,f1_scores_lr,"undersampling")
Support Vector Machine aims to find the hyperplane that separates in the best way the classes in the feature space.
The general equation for a hyperplane is:
$\beta_{0}+\beta_{1}X_{1}+...+\beta_{p}X_{p}=0 $
HARD MARGIN
SOFT MARGIN
KERNEL TRICK
SVM is optimal in terms of minimizing the risk of making mistakes:
Hyperparameters selection:
# SVM with linear, RBF and polynomial kernels, undersampling.
param_grid_linear_SVM={"kernel":["linear"],"C":[0.1,1,10]}
configuration_linear_SVM,best_linear_SVM,configurations_svm,f1_scores_svm=my_k_fold(param_grid_linear_SVM,SVC,"undersampling")
recall_linear_SVM_undersampling=plot_all(SVC,"Linear SVM",configuration_linear_SVM,best_linear_SVM,configurations_svm,f1_scores_svm,"undersampling")
param_grid_RBF_SVM={"kernel":["rbf"],"C":[0.1,1,10],"gamma":["auto","scale"]}
configuration_RBF_SVM,best_svm_rbf,configurations_svm_rbf,f1_scores_svm_rbf=my_k_fold(param_grid_RBF_SVM,SVC,"undersampling")
recall_RBF_SVM_undersampling=plot_all(SVC,"Rbf SVM",configuration_RBF_SVM,best_svm_rbf,configurations_svm_rbf,f1_scores_svm_rbf,"undersampling")
param_grid_POLY_SVM={"kernel":["poly"],"degree":[2,3],"C":[0.1,1,10],"gamma":["auto","scale"]}
configuration_POLY_SVM,best_svm_poly,configurations_svm_poly,f1_scores_svm_poly=my_k_fold(param_grid_POLY_SVM,SVC,"undersampling")
recall_POLY_SVM_undersampling=plot_all(SVC,"Poly SVM",configuration_POLY_SVM,best_svm_poly,configurations_svm_poly,f1_scores_svm_poly,"undersampling")
LDA, Linear Discriminant Analysis, is the generalization of Fisher's Discriminant Analysis.
It's similar to Logistic regression, but instead of modeling $Pr(Y=k|X=x)$ using the logistic function, LDA computes the distribution of the predictors X separately for each Y class, and then thank to Bayes' theorem it finds $Pr(Y=k|X=x)$.
It looks for finding a linear combination of features that separates two or more classes, events. The resulting combination can be used as a linear classifier or for dimensionality reduction.
It's based on some assumptions:
Differently from Gaussian NB, no independence assumptions are done over the features.
Example:
Assuming that it's known how an n-dimensional vector X varies in two populations: Y (stands for default), N (for no default):
# Linear Discriminant Analysis, undersampling.
param_grid_LDA={"tol":[1e-5,1e-4,1e-3,1e-2]}
configuration_LDA,best_lda,configurations_lda,f1_scores_lda=my_k_fold(param_grid_LDA,LinearDiscriminantAnalysis,"undersampling")
recall_LDA_undersampling=plot_all(LinearDiscriminantAnalysis,"Linear Discriminant Analysis",configuration_LDA,best_lda,configurations_lda,f1_scores_lda,"undersampling")
This is an algorithm which applies Bayes' theorem with strong feature independence assumptions.
Strong and weak points:
# Gaussian Naive Bayes has no hyperparameter grid: empty placeholders are
# passed so plot_all's no-grid layout is used.
recall_GAUSSIAN_undersampling=plot_all(GaussianNB,"Gaussian NB",{},{},{},{},"undersampling")
The second technique used to deal with the unbalance dataset is oversampling, in particular SMOTE.
SMOTE consists in creating synthetic points from the smallest class in order to obtain a balancing between the minority and majority class.
Explanation
# Same experiments with SMOTE oversampling.  Note the grids here add
# max_iter caps (and a smaller C range for logistic regression), presumably
# to bound training time on the larger oversampled set.
param_grid_random_forest={"n_estimators":[100,300,500],"criterion":["gini","entropy"],"max_depth":[None,10]}
configuration_random_forest_over,best_rf_over,configurations_rf_over,f1_scores_rf_over=my_k_fold(param_grid_random_forest,RandomForestClassifier,"oversampling")
recall_random_forest_oversampling=plot_all(RandomForestClassifier,"Random Forest",configuration_random_forest_over,best_rf_over,configurations_rf_over,f1_scores_rf_over,"oversampling")
param_grid_logistic_regression={"penalty":["l2"],"C":[0.1,1],"tol":[1e-5,1e-4,1e-3],"max_iter":[500]}
configuration_logistic_regression_over,best_lr_over,configurations_lr_over,f1_scores_lr_over=my_k_fold(param_grid_logistic_regression,LogisticRegression,"oversampling")
recall_logistic_regression_oversampling=plot_all(LogisticRegression,"Logistic Regression",configuration_logistic_regression_over,best_lr_over,configurations_lr_over,f1_scores_lr_over,"oversampling")
param_grid_linear_SVM={"kernel":["linear"],"C":[0.1,1,10],"max_iter":[500000]}
configuration_linear_SVM_over,best_linear_over,configurations_linear_over,f1_scores_linear_over=my_k_fold(param_grid_linear_SVM,SVC,"oversampling")
recall_linear_SVM_oversampling=plot_all(SVC,"Linear SVM",configuration_linear_SVM_over,best_linear_over,configurations_linear_over,f1_scores_linear_over,"oversampling")
param_grid_RBF_SVM={"kernel":["rbf"],"C":[0.1,1,10],"gamma":["auto","scale"],"max_iter":[500000]}
configuration_RBF_SVM_over,best_RBF_over,configurations_RBF_over,f1_scores_RBF_over=my_k_fold(param_grid_RBF_SVM,SVC,"oversampling")
recall_RBF_SVM_oversampling=plot_all(SVC,"Rbf SVM",configuration_RBF_SVM_over,best_RBF_over,configurations_RBF_over,f1_scores_RBF_over,"oversampling")
param_grid_POLY_SVM={"kernel":["poly"],"degree":[2,3],"C":[0.1,1,10],"gamma":["auto","scale"],"max_iter":[500000]}
configuration_POLY_SVM_over,best_POLY_over,configurations_POLY_over,f1_scores_POLY_over=my_k_fold(param_grid_POLY_SVM,SVC,"oversampling")
recall_POLY_SVM_oversampling=plot_all(SVC,"Poly SVM",configuration_POLY_SVM_over,best_POLY_over,configurations_POLY_over,f1_scores_POLY_over,"oversampling")
LDA is the generalization of Fisher's Discriminant Analysis.
# LDA and Gaussian NB with SMOTE oversampling (empty placeholders select
# plot_all's no-grid layout for GaussianNB).
param_grid_LDA={"tol":[1e-5,1e-4,1e-3,1e-2]}
configuration_LDA_over,best_lda_over,configurations_lda_over,f1_scores_lda_over=my_k_fold(param_grid_LDA,LinearDiscriminantAnalysis,"oversampling")
recall_LDA_oversampling=plot_all(LinearDiscriminantAnalysis,"Linear Discriminant Analysis",configuration_LDA_over,best_lda_over,configurations_lda_over,f1_scores_lda_over,"oversampling")
recall_GAUSSIAN_oversampling=plot_all(GaussianNB,"Gaussian NB",{},{},{},{},"oversampling")
def plot_result(yes, nomi, kind):
    """Draw a grouped bar chart comparing per-class recall across models.

    Parameters
    ----------
    yes : sequence of 2-item pairs
        One (true-negative recall, true-positive recall) pair per model.
    nomi : sequence of str
        Model names used as x-axis tick labels (same order as `yes`).
    kind : str
        Sampling-strategy label shown in the chart title.

    The chart is shown with ``plt.show()``; nothing is returned.
    """
    positions = np.arange(len(nomi))
    # Split the (negative, positive) recall pairs into two parallel series.
    neg_recall = [pair[0] for pair in yes]
    pos_recall = [pair[1] for pair in yes]
    bar_width = 0.35
    fig, ax = plt.subplots(figsize=(20, 8))
    bars_neg = ax.bar(positions - bar_width / 2, neg_recall, bar_width,
                      label="TRUE NEGATIVE", color="green")
    bars_pos = ax.bar(positions + bar_width / 2, pos_recall, bar_width,
                      label="TRUE POSITIVE", color="red")
    ax.set_title(f"Recall - {kind}", fontsize=20)
    ax.set_xticks(positions)
    ax.set_xticklabels(nomi, fontsize=18)
    ax.legend(loc=1)

    def annotate_heights(bars):
        # Write each bar's height (2 decimals) just above its top edge.
        for bar in bars:
            height = bar.get_height()
            ax.annotate('{:.2f}'.format(height),
                        xy=(bar.get_x() + bar.get_width() / 2, height),
                        xytext=(0, 3), textcoords="offset points",
                        ha='center', va='bottom', fontsize=15)

    annotate_heights(bars_neg)
    annotate_heights(bars_pos)
    fig.tight_layout()
    ax.grid(False)
    plt.show()
# Gather the (TN-recall, TP-recall) pairs produced by the earlier model runs —
# one list per sampling strategy, in the same order as `name` — and render the
# two summary bar charts.
undersampling_list=[recall_random_forest_undersampling,recall_logistic_regression_undersampling,recall_linear_SVM_undersampling,recall_RBF_SVM_undersampling,recall_POLY_SVM_undersampling,recall_GAUSSIAN_undersampling,recall_LDA_undersampling]
oversampling_list=[recall_random_forest_oversampling,recall_logistic_regression_oversampling,recall_linear_SVM_oversampling,recall_RBF_SVM_oversampling,recall_POLY_SVM_oversampling,recall_GAUSSIAN_oversampling,recall_LDA_oversampling]
# X-axis labels, aligned with the two lists above.
name=["random_forest","logistic_regression","linear_SVM","RBF_SVM","POLY_SVM","GAUSSIAN","LDA"]
plot_result(undersampling_list,name,"UNDERSAMPLING")
plot_result(oversampling_list,name,"OVERSAMPLING")
Since the goal of the study is to find the best-performing model — the one able to correctly detect possible credit card defaults — F1 scores are taken into consideration.
The highest True Negative rate (non-defaults correctly classified) is reached by Random Forest with oversampling: 0.90, but the corresponding True Positive rate (defaults correctly classified) is very low: 0.48. This is bad for the bank: since the algorithm is poor at detecting defaults, the financial loss will be high.
The highest True Positive rate is reached by Gaussian NB with oversampling: 0.60, but the corresponding True Negative rate is not satisfactory: 0.70. This means that many non-default clients would be classified as defaulters, so the bank may refuse them a loan and they would switch to another bank: this too is a financial loss.
Of course the final decision should be taken together with the bank, which knows the best trade-off between the True Positive and True Negative rates, but in my opinion a good classifier could be Random Forest with undersampling (tnr=0.85, tpr=0.56), or SVM with rbf kernel and oversampling (tnr=0.80, tpr=0.56).
In fact both of them have good performance in terms of true negative and true positive rates.
[1]. "An Introduction to Statistical Learning, with Applications in R". James, Witten, Hastie, Tibshirani
[2]. https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients
[3]. https://www.kaggle.com/janiobachmann/credit-fraud-dealing-with-imbalanced-datasets
[4]. https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SMOTE.html
[5]. https://en.wikipedia.org/wiki/Isolation_forest
[6]. https://scikit-learn.org/stable/supervised_learning.html#supervised-learning
[7]. https://scikit-learn.org/stable/modules/cross_validation.html